MDS clustering

Pierre Guilmin

November 2018

Requirements

Built with R version 3.5.1

  • Nicola Roberts HDP R package (GitHub link) which implements the hierarchical Dirichlet process:
    devtools::install_github("nicolaroberts/hdp", build_vignettes = TRUE)
    
  • Various R libraries:
    • ggplot2
    • reshape2
    • gridExtra
    • tibble
    • heatmap3
    • prodlim
    • survival
    • RColorBrewer
    • IRdisplay

In [1]:
library('ggplot2')
library('reshape2')
library('gridExtra')
library('tibble')
library('heatmap3')
library('prodlim')
library('survival')
library('RColorBrewer')
library('IRdisplay')

source('utils/tools.R')     # custom tools function
source('utils/hdp_tools.R') # hdp related functions

theme_set(theme_minimal())

# set jupyer notebook parameters
options(repr.plot.res        = 100, # set a medium-definition resolution for the jupyter notebooks plots (DPI)
        repr.matrix.max.rows = 200, # set the maximum number of rows displayed
        repr.matrix.max.cols = 100) # set the maximum number of columns displayed
Run citation('hdp') for citation instructions,
    and file.show(system.file('LICENSE', package='hdp')) for license details.

Get data

In [2]:
# get cytogenetics data
dd_cyto <- impact <- read.table("data/dd_cyto_cut15.tsv", sep = "\t", stringsAsFactors = FALSE, header = TRUE)

# get mutation data
dd_mutation <- read.table("data/dd_mutation_hotspot_cut10.tsv", sep = "\t", stringsAsFactors = FALSE, header = TRUE)

# merge cytogenetics and mutation data
cat(all(rownames(dd_mutation) == rownames(dd_cyto)), '\n\n') # check if same row ordering
dd_all <- cbind(dd_mutation, dd_cyto)

print_size(dd_all)
head(dd_all, 3)
TRUE 

Size of dd_all: 3300 x 157
ARID1AARID2ASXL1ASXL2ATRXBCORBCORL1BRAFBRCC3CALRCBLCDKN1BCDKN2ACEBPACREBBPCSF1RCSF3RCSNK1A1CTCFCUX1DDX23DDX4DDX41DDX54DHX33DICER1DNMT3ADNMT3BEEDEGFREP300ETNK1ETV6EZH2FAM175AFLT3GATA1GATA2GNASGNB1HIPK2IDH1IDH2_140IDH2_172IRF1JAK2JARID2KDM5CKDM6AKIT⋯del5qplus8del7delYdel20qmardel7qdel12pdel11qdel18del17pdel5plus21del17del13del3pplus1qdel16del20plus11qplus1pdel16qdel12del1pplus19del13qdel21del9qplus1plus22plus17qdel4qplus11plus14del3del3qplus17pr_3_3del11pdel14del15delXplus9r_9_9plus13plus15r_1_7del22ringWGA
I-H-132697-T1-1-D1-100100000000000001000000000100000000100000000000000⋯00001000000000000000000000000000000000000000000000
I-H-132698-T1-1-D1-100100000000000100000000000000000000000000000000000⋯00000000000000000000000000000000000000000000000000
I-H-116889-T2-1-D1-100000000000000000000000000000000010000000000000000⋯00000000000000000000000000000000000001000000000000

164 patients don't have any event:

In [3]:
nrow(dd_all[apply(dd_all, 1, function(x) sum(x)) == 0,])
164

Run HDP

We run HDP using multiple independent posterior sampling chains and a initial number of raw cluster of 15 (initcc = 15).
All the functions used are defined in utils/hdp_tools.R and make use of Nicola Roberts HDP R package (see notebook header).
To follow the HDP terminology, in the rest of the notebook the term component is used for cluster and the term category is used for one gene/cytogenetic feature (ie we have 157 categories).

In [4]:
# initialise hdp
hdp <- initialise_hdp(dd_all)
Initialise HDP on a 3300 x 157 dataframe
  → create HDP structure... done!
  → add DP node for each patient... done!
  → assign the data to the nodes... done!
In [5]:
# run multiple independent posterior sampling chains with different random seeds

number_of_chains <- 3
chain_list <- vector('list', number_of_chains)

for (i in 1:number_of_chains) {
    seed <- i * 100
    print_and_flush(sprintf('### Experiment %d (seed = %d) ###\n', i, seed))
    
    # run single hdp chain
    chain_list[[i]] <- activate_and_run_hdp(hdp,
                                            initcc = 15,
                                            burnin = 1000,
                                            n      = 300,
                                            space  = 20,
                                            seed   = seed)
    print_and_flush('\n')
}

multi_output <- hdp_multi_chain(chain_list)
multi_output
### Experiment 1 (seed = 100) ###
Activate HDP nodes and run posterior sampling
  → activate HDP nodes... done!
  → run posterior sampling...
[1] "1000 burn-in iterations in 0.2 mins"
[1] "time 1.2 ETC 1.2 mins"

### Experiment 2 (seed = 200) ###
Activate HDP nodes and run posterior sampling
  → activate HDP nodes... done!
  → run posterior sampling...
[1] "1000 burn-in iterations in 0.2 mins"
[1] "time 1.2 ETC 1.3 mins"

### Experiment 3 (seed = 300) ###
Activate HDP nodes and run posterior sampling
  → activate HDP nodes... done!
  → run posterior sampling...
[1] "1000 burn-in iterations in 0.2 mins"
[1] "time 1.2 ETC 1.2 mins"

Object of class hdpSampleMulti 
 Number of chains: 3 
 Total posterior samples: 900 
 Components: NO. Run hdp_extract_components 
 ----------
 Final hdpState from first chain: 
Object of class hdpState 
 Number of DP nodes: 3301 
 Index of parent DP: 0 1 1 1 1 1 1 1 1 1 ...
 Number of data items per DP: 0 9 3 5 10 1 0 4 3 6 ...
 Index of conparam per DP: 1 1 1 1 1 1 1 1 1 1 ...
 Conparam hyperparameters and current value:
           Shape Rate    Value
Conparam 1     1    1 1.602232
 Number of data categories: 157 
 Number of clusters: 14 
 Initialised with 15 clusters, using random seed 100 
In [6]:
# assess quality of posterior sampling chain
for (experiment in chains(multi_output))
    plot_posterior_sampling_chain_quality(experiment, 15, 5)
In [7]:
# extract and plot components
multi_output <- extract_components(multi_output)
plot_components_size(multi_output)
Extract HDP components from posterior sampling
  → extract components... done!
* 11 components found

WARNING: component 0 is a "garbage" component, not a real component.

In [8]:
plot_category_distribution_by_component(multi_output, colnames(dd_all))
In [9]:
# get top 7 categories for each component
get_top_categories_by_component(multi_output, colnames(dd_all), top_number = 7)
component_0component_1component_2component_3component_4component_5component_6component_7component_8component_9component_10component_11
ASXL1 TET2 TP53 ASXL1 SF3B1 ASXL1 del5q ASXL1 U2AF1_34 DNMT3A plus21 plus1q
U2AF1_157SRSF2 del5q SRSF2 TET2 SETBP1 SF3B1 EZH2 BCOR NPM1 plus8 del7q
TET2 ASXL1 mar STAG2 DNMT3A U2AF1_157DNMT3A TET2 DNMT3A FLT3 plus9 r_1_7
KMT2D SF3B1 del7 RUNX1 delY del7 TP53 RUNX1 del20q WT1 plus14 ETNK1
r_9_9 CBL del7q TET2 KMT2C ETV6 plus8 ZRSR2 BCORL1 PTPN11 TP53 plus1
PHF6 CUX1 del12p IDH2_140 KMT2D RUNX1 CSNK1A1 U2AF1_157RUNX1 NRAS plus19 ZMYM3
STAT3 DDX41 del18 BCOR ZBTB33 PTPN11 TET2 PHF6 NRAS KDM6A plus1 GATA1

Study HDP components

Let's get the probability prediction for each patient and each component. Notice that the 164 patients with no mutation/cytogenetics are NA rows.

In [10]:
dd_predicted <- get_prediction_result_dataframe(multi_output, dd_all)
head(dd_predicted)
Number of components: 11
Number of NA rows   : 164
component_0component_1component_2component_3component_4component_5component_6component_7component_8component_9component_10component_11predicted_componentmax_proba
0.01691358 0.04098765 0.009382716 0.07901235 0.16148148 0.37283951 0.07950617280.08716049 0.101728395 0.05086420 0 0.00012345685 0.3728395
0.11259259 0.03074074 0.000000000 0.16444444 0.06370370 0.11111111 0.00074074070.48851852 0.014814815 0.01333333 0 0.00000000007 0.4885185
0.01044444 0.02488889 0.002888889 0.02133333 0.40977778 0.36800000 0.02444444440.07177778 0.008444444 0.05777778 0 0.00022222224 0.4097778
0.01444444 0.11433333 0.071888889 0.04766667 0.04400000 0.03044444 0.00333333330.64288889 0.006555556 0.01188889 0 0.01255555567 0.6428889
0.02555556 0.19777778 0.000000000 0.33333333 0.02555556 0.18222222 0.00000000000.23555556 0.000000000 0.00000000 0 0.00000000003 0.3333333
NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaNNaN NaN
In [11]:
n_components <- ncol(dd_predicted) - 2 - 1 # without columns 'component_0', 'predicted_component' and 'max_proba'
n_components
11

Component size in the dataset:

In [12]:
get_table(dd_predicted[,'predicted_component'])
valuescountfreq
1 665 20.2%
4 654 19.8%
3 463 14%
2 355 10.8%
6 287 8.7%
5 243 7.4%
7 205 6.2%
NaN 164 5%
8 151 4.6%
9 75 2.3%
10 20 0.6%
11 18 0.5%
-- total --3300 100%
In [13]:
plot_continous_feature_by_components <- function(data, feature_name, scale_y) {    
    set_notebook_plot_size(25, 7)
    
    myLabeller <- function(x) {
        x$predicted_component <- apply(x, 1, function(s) sprintf('Component %s (n = %d)', s, nrow(data[data$predicted_component == s,])))
        return (x)
    }
        
    n_initial <- nrow(data)
    data <- data[!is.na(data[feature_name]),]
                                       
    p1 <- ggplot(data) + geom_boxplot(aes_string('predicted_component', feature_name, fill = 'predicted_component'), alpha = 0.7, notch = TRUE) + scale_fill_brewer(palette = 'Spectral', na.value = 'grey') +
              ggtitle(sprintf('%s density by component\n(removed %d NA values)', feature_name, n_initial - nrow(data))) +  theme(plot.title = element_text(size = 18, face = "bold"))
                                       
    p2 <- ggplot(data) + geom_violin(aes_string('predicted_component', feature_name, fill = 'predicted_component'), alpha = 0.7) + scale_fill_brewer(palette = 'Spectral', na.value = 'grey') +
              ggtitle(' \n ') +  theme(plot.title = element_text(size = 18, face = "bold"))

    p3 <- ggplot(data) + geom_density(aes_string(feature_name, fill = 'predicted_component'), alpha = 0.7) + scale_fill_brewer(palette = 'Spectral', na.value = 'grey') +
              facet_wrap(~ predicted_component, ncol = 4, labeller = myLabeller) +
              ggtitle(' \n ') +  theme(plot.title = element_text(size = 18, face = "bold"))
                                       
    if (scale_y) {
        p1 <- p1 + scale_y_sqrt()
        p2 <- p2 + scale_y_sqrt()
    }
                                       
    grid.arrange(p1, p2, p3, ncol = 3)
}

#suppressWarnings(plot_continous_feature_by_components(dd_clinical, 'HB', TRUE))
In [14]:
source('utils/hdp_tools.R')
In [15]:
plot_assignement_probability_by_component(dd_predicted)
In [16]:
plot_assignement_probability_distribution_by_component(dd_predicted, 25, 10)

Components heatmap

In [17]:
# merge patient information dataframe with predicted probability and component
dd_clustered <- cbind(dd_all, dd_predicted)
dd_clustered <- rownames_to_column(dd_clustered, var = 'patient_key')

# sort by decreseasing component size and increasing assignemnt probability
categories_table <- get_table(dd_clustered[,'predicted_component'], add_total_count = FALSE)
dd_clustered$predicted_component <- factor(dd_clustered$predicted_component, levels = categories_table$values)
dd_clustered <- dd_clustered[order(dd_clustered$predicted_component, dd_clustered$max_proba),]

print_size(dd_clustered)
head(dd_clustered, 3)
Size of dd_clustered: 3300 x 172
patient_keyARID1AARID2ASXL1ASXL2ATRXBCORBCORL1BRAFBRCC3CALRCBLCDKN1BCDKN2ACEBPACREBBPCSF1RCSF3RCSNK1A1CTCFCUX1DDX23DDX4DDX41DDX54DHX33DICER1DNMT3ADNMT3BEEDEGFREP300ETNK1ETV6EZH2FAM175AFLT3GATA1GATA2GNASGNB1HIPK2IDH1IDH2_140IDH2_172IRF1JAK2JARID2KDM5CKDM6A⋯del13del3pplus1qdel16del20plus11qplus1pdel16qdel12del1pplus19del13qdel21del9qplus1plus22plus17qdel4qplus11plus14del3del3qplus17pr_3_3del11pdel14del15delXplus9r_9_9plus13plus15r_1_7del22ringWGAcomponent_0component_1component_2component_3component_4component_5component_6component_7component_8component_9component_10component_11predicted_componentmax_proba
1007E-H-110762-T1-1-D1-10 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ⋯ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.01916667 0.1994444 0.07333333 0.11666667 0.14055556 0.16805556 0.060555556 0.05444444 0.0075000000 0.0319444444 0.09805556 0.03027778 1 0.1994444
941E-H-102643-T1-1-D1-10 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ⋯ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.01185185 0.2118519 0.19814815 0.17888889 0.07888889 0.03777778 0.135185185 0.13148148 0.0007407407 0.0000000000 0.01518519 0.00000000 1 0.2118519
936E-H-102628-T1-1-D1-10 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ⋯ 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.03634921 0.2134921 0.04238095 0.06222222 0.20238095 0.10476190 0.007460317 0.15269841 0.0287301587 0.0004761905 0.05936508 0.08968254 1 0.2134921
In [18]:
# compute column separators position
col_sep <- c(cumsum(categories_table$count))
col_sep <- col_sep[-length(col_sep)] # remove last separator

# compute row separators position
row_sep <- c(58, 6, 15, 8, 11, 7, 2, 50)
row_sep_cum <- c(cumsum(row_sep))
row_sep_cum <- row_sep_cum[-length(row_sep_cum)]

# compute row colors vector
sep_color <- rev(c(rgb(180/255, 142/255, 84/255),
                   rgb(146/255, 187/255, 105/255),
                   rgb(233/255, 72/255 , 157/255),
                   rgb(72/255 , 169/255, 137/255),
                   rgb(226/255, 146/255, 102/255),
                   rgb(140/255, 137/255, 190/255),
                   rgb(238/255, 198/255, 116/255),
                   rgb(0/255, 0/255, 0/255)))
row_colors <- c()
for (i in 1:length(row_sep))
    row_colors <- c(row_colors, rep(sep_color[i], row_sep[i]))
In [19]:
# manual gene grouping
col_order <- c(
'ARID1A', 'ARID2', 'BCORL1', 'BRCC3', 'CALR', 'CDKN1B', 'CDKN2A', 'CEBPA', 'CREBBP', 'CSF1R', 'CSF3R', 'CSNK1A1', 'DDX23', 'DDX4', 'DDX41',
'DDX54', 'DHX33', 'DICER1', 'DNMT3B', 'EED', 'EGFR', 'ETNK1', 'FAM175A', 'HIPK2', 'JAK2', 'JARID2', 'KDM5C', 'KMT2C', 'KMT2D', 'LUC7L2',
'MGA', 'MPL', 'NFE2', 'NIPBL', 'NOTCH2', 'NPM1', 'PAPD5', 'PHIP', 'PIK3CA', 'PRPF40B', 'PTPRF', 'RAD50', 'RASGRF1', 'RB1', 'ROBO1', 'ROBO2',
'RRAS', 'SETD2', 'SMG1', 'SPRED2', 'SRCAP', 'STAT3', 'SUZ12', 'TERT', 'YLPM1', 'ZBTB33', 'ZMYM3', 'ZNF318',

'ASXL1', 'BCOR', 'EZH2', 'RAD21', 'STAG2', 'ASXL2',

'ATRX', 'CUX1', 'GNAS', 'IRF1', 'KDM6A', 'KMT2A', 'NOTCH1', 'PHF6', 'SETBP1', 'WT1', 'SH2B3', 'SMC1A', 'EP300', 'SMC3', 'GNB1',

'BRAF', 'CBL', 'FLT3', 'KIT', 'KRAS', 'NF1', 'NRAS', 'PTPN11',

'DNMT3A', 'IDH1', 'IDH2_140', 'IDH2_172', 'TET2', 'ETV6', 'GATA1', 'GATA2', 'RUNX1', 'CTCF', 'MYC',

'SF3B1', 'SRSF2', 'U2AF1_157', 'U2AF1_34', 'U2AF2', 'ZRSR2', 'PRPF8',

'PPM1D', 'TP53',

'del1p', 'del3', 'del3q', 'del4q', 'del5', 'del5q', 'del7', 'del7q', 'del9q', 'del11p', 'del11q', 'del12', 'del12p', 'del13', 'del13q',
'del3p', 'del14', 'del15', 'del16', 'del16q', 'del17', 'del17p', 'del18', 'del20', 'del20q', 'del21', 'del22', 'delX', 'delY',
'plus1', 'plus1p', 'plus1q', 'plus8', 'plus9', 'plus11', 'plus11q', 'plus13', 'plus14', 'plus15', 'plus17p', 'plus17q', 'plus19', 'plus21', 'plus22',
'ring', 'mar', 'WGA', 'r_3_3', 'r_9_9', 'r_1_7'
)
length(col_order) # check that we have all 157 categories
157
In [20]:
# set a high-definition resolution for this plot (DPI)
options(repr.plot.res = 200)
set_notebook_plot_size(13, 13)

# plot the heatmap
heatmap3(t(as.matrix(dd_clustered[,col_order])),
         
         # no dendogram, no scaling, small legend
         Rowv = NA, Colv = NA, labCol = FALSE, scale = 'none', legendfun = function() showLegend(legend=c('Presence', 'Absence'), col=c('steelblue', 'lightgrey')),
         col = c('lightgrey', 'steelblue'),
        
         # set row colors
         RowSideColors = row_colors,
         RowAxisColors = 1,
         
         # add horizontal and vertical lines (+ 0.5 to be at the exact separation)
         add.expr = abline(v = col_sep + 0.5,
                           h = row_sep_cum + 0.5,
                           col = 'white', lwd = 1, lty = 'dotted'),
         
         # row label size
         cexRow = 0.4)

Categories distribution by component

In [21]:
categories_table
valuescountfreq
1 665 20.2%
4 654 19.8%
3 463 14%
2 355 10.8%
6 287 8.7%
5 243 7.4%
7 205 6.2%
NaN 164 5%
8 151 4.6%
9 75 2.3%
10 20 0.6%
11 18 0.5%
In [22]:
# create a dataframe showing the count by component for each category

categories_repartition <- data.frame(category = colnames(dd_all))

for (i in 1:n_components)
    categories_repartition[sprintf('component_%d', i)] <- apply(categories_repartition, 1, function(s) sum(dd_clustered[dd_clustered$predicted_component == i, s['category']]))
                   
# sort categories_repartition by most frequent genes
categories_repartition['total_count'] <- rowSums(categories_repartition[,-1])
categories_repartition <- categories_repartition[order(categories_repartition$total_count, decreasing = TRUE),]
head(categories_repartition)
categorycomponent_1component_2component_3component_4component_5component_6component_7component_8component_9component_10component_11total_count
97TET2 412 32 149 234 38 43 83 21 11 5 0 1028
3ASXL1 169 18 298 72 125 31 164 22 1 2 3 905
85SF3B1 101 16 42 410 23 114 10 15 5 0 0 736
92SRSF2 202 8 274 21 48 2 4 1 4 3 2 569
27DNMT3A 35 44 37 213 29 73 8 49 36 3 3 530
108del5q 21 217 12 11 13 166 2 3 7 9 0 461
In [23]:
# plot category distribution colored by component count

# transform (extand) categories_repartition in a long data frame by gathering the components columns in one column
long_categories_repartition <- melt(categories_repartition, id = c('category', 'total_count'))
# sort long_gene_repartition by most frequent genes
long_categories_repartition$category <- factor(long_categories_repartition$category, levels = categories_repartition$category)
long_categories_repartition <- long_categories_repartition[order(long_categories_repartition$category),]

set_notebook_plot_size(25, 7)
# count plot
ggplot(long_categories_repartition) +
    geom_bar(aes(category, value, fill = variable), stat = 'identity') +
    tilt_x_label(70) +
    scale_fill_brewer(palette = 'Spectral', na.value = 'grey')

# proportion plot
ggplot(long_categories_repartition) +
    geom_bar(aes(category, value, fill = variable), stat = 'identity', position = 'fill') +
    tilt_x_label(70) +
    scale_fill_brewer(palette = 'Spectral', na.value = 'grey')
In [24]:
# plot category distribution by component

# sort long_gene_repartition by heatmap gene order
long_categories_repartition$category <- factor(long_categories_repartition$category, levels = col_order)
long_categories_repartition <- long_categories_repartition[order(long_categories_repartition$category),]

# plot category distribution by component
set_notebook_plot_size(25, 40)
ggplot(long_categories_repartition) + geom_bar(aes(category, value, fill = variable), stat='identity', alpha = 0.8) + tilt_x_label(70) + facet_wrap(~ variable, ncol = 1, scales = 'free')
In [25]:
# save dd_clustered
write.table(dd_clustered, file = 'data/dd_clustered.tsv', sep = '\t')

Comutation plots

From Elsa:

Clinical correlations

In [26]:
# get clinical data
dd_clinical <- read.table("data/df_clinical_selected.tsv", sep = "\t", stringsAsFactors = FALSE, header = TRUE)

# merge with clustering data (that already contains mutation and cytogenetics data)
dd_clinical <- merge(dd_clustered, dd_clinical, by.x = 'patient_key', by.y = 'LEUKID')

print_size(dd_clinical)
head(dd_clinical, 2)
Size of dd_clinical: 3300 x 235
patient_keyARID1AARID2ASXL1ASXL2ATRXBCORBCORL1BRAFBRCC3CALRCBLCDKN1BCDKN2ACEBPACREBBPCSF1RCSF3RCSNK1A1CTCFCUX1DDX23DDX4DDX41DDX54DHX33DICER1DNMT3ADNMT3BEEDEGFREP300ETNK1ETV6EZH2FAM175AFLT3GATA1GATA2GNASGNB1HIPK2IDH1IDH2_140IDH2_172IRF1JAK2JARID2KDM5CKDM6A⋯PATIENT_RECEIVED_MORE_THAN_1_DMTPATIENT_RECEIVED_MORE_THAN_1_TPLCENTERCENTER_COHORTBATCHFLAG_KEEP_REASONFLAG_ISSUE_DATEAMLSEX_INFER_SEQUENCINGFLAG_ISSUE_SEXDIAG_PB_BLAST_RANGESAMPLE_PB_BLAST_RANGEDIAG_RINGED_SIDEROBLASTS_RANGEDIAG_BM_BLAST_RANGESAMPLE_RINGED_SIDEROBLASTS_RANGESAMPLE_BM_BLAST_RANGEQC_DETAILEDSELECTED_SAMPLEtime_diag_sample_daystime_diag_sample_rangetype_of_dmtDMT_STRICTis_sample_naivedmt_stageWHO_2008FABPB_BLASTLDHFERRITINWBCANCHBPLTMONOCYTESBM_BLASTCYTOGENETICSNUMBER_OF_METAPHASESIPSSIPSS_RRINGED_SIDEROBLASTSRINGED_SIDEROBLASTS_BINARYRINGED_SIDEROBLASTS_BINARY15WHO_2008_SIMPLIFYtime_diag_dmt_strict_daystime_diag_transplant_daysis_treatedos_diag_yearsos_statusamltime_diag_aml_days
E-H-100000-T1-1-D1-1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 ⋯ NA NA IHBT IHBT_1 B pass NA no F NA <5% NA NA <5% NA NA pass yes 0 < 1 month len yes naive after-sample del(5q) RA 0 7.3 226 7.50 4.65 8.8 406 NA 3 46,xx,del(5)(q15q33.3)[3]/46,xx[1] 22 low good NA NA NA 5q- 2432 NA treated 5.819178 0 no NA
E-H-100001-T1-1-D1-1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ⋯ NA NA IHBT IHBT_1 B pass NA no F NA <5% NA NA <5% NA NA pass yes 0 < 1 month no naive no-treatment RA RA 0 5.0 103 7.21 2.88 9.8 364 NA 2 46,xx,del(5)(q13q33)[6]/47,xx,+8[1]/46,xx[15] 22 int1 good NA NA NA RCUD NA NA not-treated 2.857534 0 no NA
In [27]:
plot_continous_feature_by_components <- function(data, feature_name, scale_y_sqrt) {
    # Plot the distribution of a continuous feature across the components, produce boxplot, violin plot and density plots side by side
    # → Arguments
    #     - data        : dataframe with one row per patient and one column `predicted_component` as well as one column for the continuous feature to plot
    #     - feature_name: name of the continuous feature to plot
    #     - scale_y_sqrt: wheter to plot with a sqrt y scale or not
     
    set_notebook_plot_size(25, 7)
    
    # custom labeller for facet_wrap for density plot
    myLabeller <- function(x) {
        x$predicted_component <- apply(x, 1, function(s) sprintf('Component %s (n = %d)', s, nrow(data[data$predicted_component == s,])))
        return (x)
    }
        
    # remove NA values
    n_initial <- nrow(data)
    data <- data[!is.na(data[feature_name]),]
    
    # boxplot
    p1 <- ggplot(data) +
              geom_boxplot(aes_string('predicted_component', feature_name, fill = 'predicted_component'), alpha = 0.7, notch = TRUE) +
              scale_fill_manual(values = colorRampPalette(brewer.pal(11, "Spectral"))(12)) +
              ggtitle(sprintf('%s density by component\n(removed %d NA values)', feature_name, n_initial - nrow(data))) +
              theme(plot.title = element_text(size = 18, face = "bold"))
    
    # violin plot
    p2 <- ggplot(data) + geom_violin(aes_string('predicted_component', feature_name, fill = 'predicted_component'), alpha = 0.7) +
              scale_fill_manual(values = colorRampPalette(brewer.pal(11, "Spectral"))(12)) +
              ggtitle(' \n ') +
              theme(plot.title = element_text(size = 18, face = "bold"))
    
    # density plot
    p3 <- ggplot(data) + geom_density(aes_string(feature_name, fill = 'predicted_component'), alpha = 0.7) +
              facet_wrap(~ predicted_component, ncol = 4, labeller = myLabeller) +
              scale_fill_manual(values = colorRampPalette(brewer.pal(11, "Spectral"))(12)) +
              ggtitle(' \n ') +
              theme(plot.title = element_text(size = 18, face = "bold"))
                                       
    if (scale_y_sqrt) {
        p1 <- p1 + scale_y_sqrt()
        p2 <- p2 + scale_y_sqrt()
    }
    
    # plot the three plots side by side and remove ggplot warnings
    suppressWarnings(suppressMessages(grid.arrange(p1, p2, p3, ncol = 3)))
}

# plot_continous_feature_by_components(dd_clinical, 'HB', TRUE)
In [28]:
# plot the distribution of multiple continuous features accross components
for (feature_name in c('HB', 'PLT', 'ANC', 'WBC', 'MONOCYTES', 'PB_BLAST', 'BM_BLAST', 'RINGED_SIDEROBLASTS')) {
    plot_continous_feature_by_components(dd_clinical, feature_name, TRUE)
}
In [29]:
plot_continous_feature_by_components(dd_clinical, 'AGE_AT_SAMPLE_TIME', FALSE)
In [30]:
plot_categorical_feature_by_components <- function(data, feature_name) {
    # Plot the distribution of a categorical feature across the components
    # → Arguments
    #     - data        : dataframe with one row per patient and one column `predicted_component` as well as one column for the categorical feature to plot
    #     - feature_name: name of the categorical feature to plot
    
    set_notebook_plot_size(25, 5)

    # remove NA values
    n_initial <- nrow(data)
    data <- data[!is.na(data[feature_name]),]
    
    # categorical feature distribution for each component
    p1 <- ggplot(data) +
              geom_bar(aes_string('predicted_component', fill = feature_name), position = 'fill') +
              ggtitle(sprintf('%s count and proportion by component\n(removed %d NA values)', feature_name, n_initial - nrow(data))) +
              theme(plot.title = element_text(size = 18, face = "bold"))
    
    # categorical feature distribution by component
    p2 <- ggplot(data) +
              geom_bar(aes_string(feature_name, fill = 'predicted_component'), position = 'fill') +
              scale_fill_manual(values = colorRampPalette(brewer.pal(11, "Spectral"))(12)) +
              ggtitle(' \n ') +
              theme(plot.title = element_text(size = 18, face = "bold"))
    
    # categorical feature count by component
    p3 <- ggplot(data) +
              geom_bar(aes_string(feature_name, fill = 'predicted_component'), position = 'dodge') +
              scale_fill_manual(values = colorRampPalette(brewer.pal(11, "Spectral"))(12)) +
              ggtitle(' \n ') + 
              theme(plot.title = element_text(size = 18, face = "bold")) 
    
    # plot the three plots side by side and remove ggplot warnings
    suppressWarnings(suppressMessages(grid.arrange(p1, p2, p3, ncol = 3)))
}

# plot_categorical_feature_by_components(dd_clinical, 'AML')
In [31]:
# plot the distribution of multiple categorical features accross components
for (feature_name in c('AML', 'WHO_2008_SIMPLIFY')) {
    suppressWarnings(plot_categorical_feature_by_components(dd_clinical, feature_name))
}

Survival plots

Work in progress...

In [32]:
set_notebook_plot_size(10, 10)
par(mar = c(3.0, 1.0, 1.0, 1.0))
plot(prodlim(Hist(os_diag_years, os_status) ~ predicted_component, data = dd_clinical[dd_clinical$predicted_component %in% 1:9,], conf.int = FALSE), , col = brewer.pal(10, 'Spectral'))
plot(prodlim(Hist(os_diag_years, os_status) ~ predicted_component, data = dd_clinical[dd_clinical$predicted_component %in% c(NaN),], conf.int = FALSE), , col = 'grey', add = TRUE)
In [33]:
print_p_value_from_survival_curve <- function(data, column_name, value_1_name, value_2_name) {
    # work in progress...
    stmp <- survdiff(as.formula(sprintf('Surv(os_diag_years, os_status) ~ %s', column_name)), data = data[data[,column_name] %in% c(value_1_name, value_2_name),])
    pval <- 1 - pchisq(stmp$chisq, length(stmp$n) - 1)
    
    print_and_flush(sprintf('%-21s and %21s: p = %.3f\n', value_1_name, value_2_name, pval))
}

plot_survival_curve <- function(dd_clinical, category_name, component) {
    # work in progress...
    
    display_markdown(sprintf('**%s on components %s**', category_name, paste(component, collapse = ', ')))
    
    extra_col_name <- sprintf('%s_component', category_name)
    
    dd_clinical[extra_col_name] <- 'wildtype'
    dd_clinical[dd_clinical[category_name] == 1, extra_col_name] <- sprintf('%s_other', category_name)
    
    for (i in component) 
        dd_clinical[dd_clinical[category_name] == 1 & dd_clinical$predicted_component == i, extra_col_name] <- sprintf('%s_component_%d', category_name, i)
        
    
    print_and_flush('## count table\n')
    print(get_table(dd_clinical[extra_col_name], add_total_count = FALSE), row.names = FALSE)
    print_and_flush('\n')
    
    
    print_and_flush('## p-values\n')
    categories <- as.character(get_table(dd_clinical[extra_col_name], add_total_count = FALSE)$values)
    pairs <- combn(categories, 2)
    
    for (i in seq(1, length(pairs), 2))
        print_p_value_from_survival_curve(dd_clinical, extra_col_name, pairs[i], pairs[i + 1])
    
    print_and_flush('\n')
        
    
    set_notebook_plot_size(7, 7)
    par(mar = c(3.0, 1.0, 1.0, 1.0))
    
    sr <- prodlim(as.formula(sprintf('Hist(os_diag_years, os_status) ~ %s', extra_col_name)), data = dd_clinical)
    plot(sr, confint = FALSE, col = brewer.pal(length(categories), 'Spectral'), legend.title = '', legend.x = 'topright')
    
    Sys.sleep(0.2)
}
In [34]:
plot_survival_curve(dd_clinical, 'SF3B1', c(4, 6))

SF3B1 on components 4, 6

## count table
            values count  freq
          wildtype  2564 77.7%
 SF3B1_component_4   410 12.4%
       SF3B1_other   212  6.4%
 SF3B1_component_6   114  3.5%

## p-values
wildtype              and     SF3B1_component_4: p = 0.000
wildtype              and           SF3B1_other: p = 0.163
wildtype              and     SF3B1_component_6: p = 0.001
SF3B1_component_4     and           SF3B1_other: p = 0.000
SF3B1_component_4     and     SF3B1_component_6: p = 0.038
SF3B1_other           and     SF3B1_component_6: p = 0.016

In [35]:
plot_survival_curve(dd_clinical, 'SRSF2', c(1, 3, 5))

SRSF2 on components 1, 3, 5

## count table
            values count  freq
          wildtype  2731 82.8%
 SRSF2_component_3   274  8.3%
 SRSF2_component_1   202  6.1%
 SRSF2_component_5    48  1.5%
       SRSF2_other    45  1.4%

## p-values
wildtype              and     SRSF2_component_3: p = 0.000
wildtype              and     SRSF2_component_1: p = 0.798
wildtype              and     SRSF2_component_5: p = 0.000
wildtype              and           SRSF2_other: p = 0.020
SRSF2_component_3     and     SRSF2_component_1: p = 0.000
SRSF2_component_3     and     SRSF2_component_5: p = 0.108
SRSF2_component_3     and           SRSF2_other: p = 0.611
SRSF2_component_1     and     SRSF2_component_5: p = 0.000
SRSF2_component_1     and           SRSF2_other: p = 0.025
SRSF2_component_5     and           SRSF2_other: p = 0.075

In [36]:
plot_survival_curve(dd_clinical, 'U2AF1_157', c(5, 7))

U2AF1_157 on components 5, 7

## count table
                values count  freq
              wildtype  3138 95.1%
 U2AF1_157_component_7    63  1.9%
 U2AF1_157_component_5    50  1.5%
       U2AF1_157_other    49  1.5%

## p-values
wildtype              and U2AF1_157_component_7: p = 0.375
wildtype              and U2AF1_157_component_5: p = 0.000
wildtype              and       U2AF1_157_other: p = 0.012
U2AF1_157_component_7 and U2AF1_157_component_5: p = 0.124
U2AF1_157_component_7 and       U2AF1_157_other: p = 0.438
U2AF1_157_component_5 and       U2AF1_157_other: p = 0.619

In [37]:
plot_survival_curve(dd_clinical, 'U2AF1_34', c(8))

U2AF1_34 on components 8

## count table
               values count  freq
             wildtype  3184 96.5%
 U2AF1_34_component_8    95  2.9%
       U2AF1_34_other    21  0.6%

## p-values
wildtype              and  U2AF1_34_component_8: p = 0.032
wildtype              and        U2AF1_34_other: p = 0.001
U2AF1_34_component_8  and        U2AF1_34_other: p = 0.054

In [38]:
plot_survival_curve(dd_clinical, 'ZRSR2', c(1, 7))

ZRSR2 on components 1, 7

## count table
            values count  freq
          wildtype  3120 94.5%
 ZRSR2_component_1    95  2.9%
 ZRSR2_component_7    56  1.7%
       ZRSR2_other    29  0.9%

## p-values
wildtype              and     ZRSR2_component_1: p = 0.013
wildtype              and     ZRSR2_component_7: p = 0.009
wildtype              and           ZRSR2_other: p = 0.861
ZRSR2_component_1     and     ZRSR2_component_7: p = 0.000
ZRSR2_component_1     and           ZRSR2_other: p = 0.202
ZRSR2_component_7     and           ZRSR2_other: p = 0.145

In [39]:
plot_survival_curve(dd_clinical, 'del5q', c(2, 6))

del5q on components 2, 6

## count table
            values count freq
          wildtype  2839  86%
 del5q_component_2   217 6.6%
 del5q_component_6   166   5%
       del5q_other    78 2.4%

## p-values
wildtype              and     del5q_component_2: p = 0.000
wildtype              and     del5q_component_6: p = 0.005
wildtype              and           del5q_other: p = 0.590
del5q_component_2     and     del5q_component_6: p = 0.000
del5q_component_2     and           del5q_other: p = 0.000
del5q_component_6     and           del5q_other: p = 0.027

In [40]:
plot_survival_curve(dd_clinical, 'TP53', c(2, 6))

TP53 on components 2, 6

## count table
           values count  freq
         wildtype  2918 88.4%
 TP53_component_2   252  7.6%
       TP53_other    74  2.2%
 TP53_component_6    56  1.7%

## p-values
wildtype              and      TP53_component_2: p = 0.000
wildtype              and            TP53_other: p = 0.000
wildtype              and      TP53_component_6: p = 0.479
TP53_component_2      and            TP53_other: p = 0.000
TP53_component_2      and      TP53_component_6: p = 0.000
TP53_other            and      TP53_component_6: p = 0.005